import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# 加载数据
from sklearn.preprocessing import OneHotEncoder

# Load the raw dataset.
data = pd.read_csv("data.csv")

# Column roles. `features` keeps the original column order; the two lists
# below decide how each column is preprocessed.
features = ['tag', 'creator', 'contributor', 'title', 'year', 'month', 'bossStatus', 'hot_score']
categorical_features = ['tag', 'creator', 'contributor', 'title', 'bossStatus']
numeric_features = ['year', 'month', 'hot_score']
target = 'score_per'

# Cast the categorical columns to str so the encoder sees one uniform type
# per column.
data[categorical_features] = data[categorical_features].astype(str)

# Rows without a target cannot be learned from or scored against, so they are
# dropped rather than being given an invented (mean) label.
has_target = data[target].notna()
X = data.loc[has_target, features]
y = data.loc[has_target, target]

# Split BEFORE fitting any preprocessing so that the encoder categories and
# the imputation means are learned from the training rows only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing:
#   * categorical columns -> one-hot; categories unseen during fit are
#     encoded as all zeros (handle_unknown='ignore');
#   * numeric columns     -> kept as numbers, missing values replaced by that
#     column's own training mean.
# The combined output stays sparse when it is sparse enough, which avoids
# materialising a huge dense one-hot matrix.
preprocessor = ColumnTransformer([
    ('categorical', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('numeric', SimpleImputer(strategy='mean'), numeric_features),
])

# Bundling preprocessing and regression guarantees that training, test and
# ad-hoc samples all go through exactly the same transformation.
model = Pipeline([
    ('preprocess', preprocessor),
    ('regress', LinearRegression()),
])

# Train the model.
model.fit(X_train, y_train)

# Predict on the held-out rows.
y_pred = model.predict(X_test)
print(y_pred)


sample = {'tag': '都市;爱情;剧情;生活;当代;内地',
          'creator': '王 欢',
          'contributor': "'江疏影', '杨采钰', '张佳宁', '张慧雯', '李浩菲'",
          'title': '唐人街探案2',
          'year': 2024,
          'month': 3,
          'bossStatus': 'FREE',
          'hot_score': 6008}

# Wrap the single sample in a one-row DataFrame with the same column names
# as the training data; the pipeline applies the fitted preprocessing itself.
sample_df = pd.DataFrame([sample])

print(model.predict(sample_df), "结果")


# Mean squared error on the held-out rows.
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)
